Loading the libraries for regression.


In [1]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline

# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab


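A note on the imports: in scikit-learn 0.18 the cross_validation module was merged into model_selection, and it was removed entirely in 0.20. On a newer version the equivalent import would be (a sketch, not run in this notebook):

# On scikit-learn >= 0.18, train_test_split lives in model_selection.
from sklearn import model_selection, linear_model, feature_selection, metrics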

Supervised Regression

Linear Regression


In [4]:
# Read in the data using pandas.
Xy = pa.read_csv('Advertising.csv')

In [5]:
# Take a look at the contents.
Xy


Out[5]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9
5 6 8.7 48.9 75.0 7.2
6 7 57.5 32.8 23.5 11.8
7 8 120.2 19.6 11.6 13.2
8 9 8.6 2.1 1.0 4.8
9 10 199.8 2.6 21.2 10.6
10 11 66.1 5.8 24.2 8.6
11 12 214.7 24.0 4.0 17.4
12 13 23.8 35.1 65.9 9.2
13 14 97.5 7.6 7.2 9.7
14 15 204.1 32.9 46.0 19.0
15 16 195.4 47.7 52.9 22.4
16 17 67.8 36.6 114.0 12.5
17 18 281.4 39.6 55.8 24.4
18 19 69.2 20.5 18.3 11.3
19 20 147.3 23.9 19.1 14.6
20 21 218.4 27.7 53.4 18.0
21 22 237.4 5.1 23.5 12.5
22 23 13.2 15.9 49.6 5.6
23 24 228.3 16.9 26.2 15.5
24 25 62.3 12.6 18.3 9.7
25 26 262.9 3.5 19.5 12.0
26 27 142.9 29.3 12.6 15.0
27 28 240.1 16.7 22.9 15.9
28 29 248.8 27.1 22.9 18.9
29 30 70.6 16.0 40.8 10.5
... ... ... ... ... ...
170 171 50.0 11.6 18.4 8.4
171 172 164.5 20.9 47.4 14.5
172 173 19.6 20.1 17.0 7.6
173 174 168.4 7.1 12.8 11.7
174 175 222.4 3.4 13.1 11.5
175 176 276.9 48.9 41.8 27.0
176 177 248.4 30.2 20.3 20.2
177 178 170.2 7.8 35.2 11.7
178 179 276.7 2.3 23.7 11.8
179 180 165.6 10.0 17.6 12.6
180 181 156.6 2.6 8.3 10.5
181 182 218.5 5.4 27.4 12.2
182 183 56.2 5.7 29.7 8.7
183 184 287.6 43.0 71.8 26.2
184 185 253.8 21.3 30.0 17.6
185 186 205.0 45.1 19.6 22.6
186 187 139.5 2.1 26.6 10.3
187 188 191.1 28.7 18.2 17.3
188 189 286.0 13.9 3.7 15.9
189 190 18.7 12.1 23.4 6.7
190 191 39.5 41.1 5.8 10.8
191 192 75.5 10.8 6.0 9.9
192 193 17.2 4.1 31.6 5.9
193 194 166.8 42.0 3.6 19.6
194 195 149.7 35.6 6.0 17.3
195 196 38.2 3.7 13.8 7.6
196 197 94.2 4.9 8.1 9.7
197 198 177.0 9.3 6.4 12.8
198 199 283.6 42.0 66.2 25.5
199 200 232.1 8.6 8.7 13.4

200 rows × 5 columns


In [6]:
# Normalize the data.
# We do this to make plotting and processing easier.  Many sklearn functions do this
# for you behind the scenes, but here we do it explicitly.
# Note that this is a cousin of the physics idea of nondimensionalization.  Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands.  One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
Xy


Out[6]:
Unnamed: 0 TV Radio Newspaper Sales
0 0.000000 0.775786 0.762097 0.605981 0.807087
1 0.005025 0.148123 0.792339 0.394019 0.346457
2 0.010050 0.055800 0.925403 0.606860 0.303150
3 0.015075 0.509976 0.832661 0.511873 0.665354
4 0.020101 0.609063 0.217742 0.510994 0.444882
5 0.025126 0.027054 0.985887 0.656992 0.220472
6 0.030151 0.192087 0.661290 0.204046 0.401575
7 0.035176 0.404126 0.395161 0.099384 0.456693
8 0.040201 0.026716 0.042339 0.006157 0.125984
9 0.045226 0.673318 0.052419 0.183817 0.354331
10 0.050251 0.221170 0.116935 0.210202 0.275591
11 0.055276 0.723706 0.483871 0.032542 0.622047
12 0.060302 0.078120 0.707661 0.576957 0.299213
13 0.065327 0.327359 0.153226 0.060686 0.318898
14 0.070352 0.687859 0.663306 0.401935 0.685039
15 0.075377 0.658438 0.961694 0.462621 0.818898
16 0.080402 0.226919 0.737903 1.000000 0.429134
17 0.085427 0.949273 0.798387 0.488127 0.897638
18 0.090452 0.231654 0.413306 0.158311 0.381890
19 0.095477 0.495773 0.481855 0.165347 0.511811
20 0.100503 0.736219 0.558468 0.467018 0.645669
21 0.105528 0.800473 0.102823 0.204046 0.429134
22 0.110553 0.042273 0.320565 0.433597 0.157480
23 0.115578 0.769699 0.340726 0.227792 0.547244
24 0.120603 0.208319 0.254032 0.158311 0.318898
25 0.125628 0.886710 0.070565 0.168865 0.409449
26 0.130653 0.480893 0.590726 0.108179 0.527559
27 0.135678 0.809604 0.336694 0.198769 0.562992
28 0.140704 0.839026 0.546371 0.198769 0.681102
29 0.145729 0.236388 0.322581 0.356201 0.350394
... ... ... ... ... ...
170 0.854271 0.166723 0.233871 0.159191 0.267717
171 0.859296 0.553940 0.421371 0.414248 0.507874
172 0.864322 0.063916 0.405242 0.146878 0.236220
173 0.869347 0.567129 0.143145 0.109938 0.397638
174 0.874372 0.749746 0.068548 0.112577 0.389764
175 0.879397 0.934055 0.985887 0.364996 1.000000
176 0.884422 0.837673 0.608871 0.175901 0.732283
177 0.889447 0.573216 0.157258 0.306948 0.397638
178 0.894472 0.933378 0.046371 0.205805 0.401575
179 0.899497 0.557660 0.201613 0.152155 0.433071
180 0.904523 0.527224 0.052419 0.070361 0.350394
181 0.909548 0.736557 0.108871 0.238347 0.417323
182 0.914573 0.187690 0.114919 0.258575 0.279528
183 0.919598 0.970240 0.866935 0.628848 0.968504
184 0.924623 0.855935 0.429435 0.261214 0.629921
185 0.929648 0.690903 0.909274 0.169745 0.826772
186 0.934673 0.469395 0.042339 0.231310 0.342520
187 0.939698 0.643896 0.578629 0.157432 0.618110
188 0.944724 0.964829 0.280242 0.029903 0.562992
189 0.949749 0.060873 0.243952 0.203166 0.200787
190 0.954774 0.131214 0.828629 0.048373 0.362205
191 0.959799 0.252959 0.217742 0.050132 0.326772
192 0.964824 0.055800 0.082661 0.275286 0.169291
193 0.969849 0.561718 0.846774 0.029024 0.708661
194 0.974874 0.503889 0.717742 0.050132 0.618110
195 0.979899 0.126818 0.074597 0.118734 0.236220
196 0.984925 0.316199 0.098790 0.068602 0.318898
197 0.989950 0.596212 0.187500 0.053650 0.440945
198 0.994975 0.956713 0.846774 0.579595 0.940945
199 1.000000 0.782550 0.173387 0.073879 0.464567

200 rows × 5 columns
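For reference, scikit-learn's preprocessing module can do the same min-max scaling; a minimal sketch (note it returns a NumPy array rather than a DataFrame):

from sklearn.preprocessing import MinMaxScaler

# Rescale every column to [0, 1], matching the (x - min) / (max - min) formula above.
Xy_scaled = MinMaxScaler().fit_transform(Xy)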


In [7]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV']]
y = Xy.ix[:,['Sales']]
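Aside: .ix is deprecated in newer pandas releases. The label-based equivalent (a sketch) is:

# .loc selects by label, just as .ix does here.
X = Xy.loc[:, ['TV']]
y = Xy.loc[:, ['Sales']]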

In [24]:
# Last time we did this by hand; now we are smarter and use the sklearn
# routine, which splits the data into training and testing subsets.
cross_validation.train_test_split([1,2,3,4,5],
                                  [6,7,8,9,10],
                                  test_size=0.4,
                                  random_state=5)


Out[24]:
[[2, 3, 4], [5, 1], [7, 8, 9], [10, 6]]

In [25]:
# Now we do it for the real data.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8)
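With test_size=0.8, only 20% of the 200 rows are kept for training. A quick sanity check on the shapes (a sketch; the counts assume the 200-row dataset above):

# Expect (40, 1) for training and (160, 1) for testing.
print X_train.shape, X_test.shape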

In [26]:
# Let's take a quick look at the data.
X_train


Out[26]:
TV
52 0.729456
119 0.063240
84 0.719648
173 0.567129
103 0.633074
13 0.327359
58 0.710517
18 0.231654
142 0.743321
82 0.252283
192 0.055800
16 0.226919
105 0.463984
133 0.740954
124 0.773757
1 0.148123
177 0.573216
126 0.024011
49 0.223876
187 0.643896
178 0.933378
59 0.710179
102 0.945215
155 0.011498
41 0.596212
91 0.094352
92 0.733852
20 0.736219
167 0.696990
186 0.469395
63 0.344944
17 0.949273
144 0.322962
150 0.946906
147 0.820088
33 0.895840
145 0.472100
199 0.782550
151 0.406831
164 0.393980

In [27]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[27]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [28]:
# Here are the intercept and slope of the line we computed.
# Beta_0
print reg.intercept_
# Beta_1
print reg.coef_


[ 0.19788723]
[[ 0.57698067]]
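A prediction from this model is just beta_0 + beta_1 * x, so we can check the fit by hand (a minimal sketch using the attributes printed above):

# Predicting by hand should agree with reg.predict.
x0 = 0.5
print reg.intercept_[0] + reg.coef_[0][0]*x0
print reg.predict([[x0]])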

In [29]:
# Do a plot
plotX = np.linspace(0,1,100)
plotY = reg.predict(np.matrix(plotX).T)
py.plot(X_train,y_train,'ro')
py.plot(X_test,y_test,'go')
py.plot(plotX,plotY,'b-')


Out[29]:
[<matplotlib.lines.Line2D at 0x173132b0>]

In [30]:
# Use the metrics package to print our errors.  See discussion on slides.
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.0181351494343
testing error
0.0159325184683
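The metrics package has other regression scores too; for instance, r2_score reports the fraction of variance explained (a sketch):

# R^2 on the held-out data: 1.0 is a perfect fit, 0.0 is no better than predicting the mean.
print metrics.r2_score(y_test, reg.predict(X_test))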

Back to slides.

Multi-dimensional regression


In [31]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV','Radio']]
y = Xy.ix[:,['Sales']]

# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [32]:
# Plot the data to get a feel for it.
mlab.clf()
mlab.points3d(X_train.ix[:,0]/X.ix[:,0].std(), 
              X_train.ix[:,1]/X.ix[:,1].std(), 
              y_train.ix[:,0]/y.ix[:,0].std(),
              color=(1,0,0), scale_factor=0.2)
mlab.points3d(X_test.ix[:,0]/X.ix[:,0].std(), 
              X_test.ix[:,1]/X.ix[:,1].std(), 
              y_test.ix[:,0]/y.ix[:,0].std(),
              color=(0,1,0), scale_factor=0.2)
mlab.axes()
mlab.show()

In [33]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[33]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
# Create data for plotting: evaluate the fitted plane on a grid.
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))
# Flatten the grid into a (size*size, 2) array of (TV, Radio) points,
# predict, then reshape the predictions back into a grid for the surface plot.
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
                                           yPlot.flatten()])))
zPlot = zPlot.reshape([size,size])

In [35]:
# Since we will be plotting many times, we wrap the plotting code in a function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
    mlab.clf()
    mlab.points3d(X_train.ix[:,0], 
                  X_train.ix[:,1], 
                  y_train.ix[:,0],
                  color=(1,0,0), scale_factor=scale_factor)
    mlab.points3d(X_test.ix[:,0], 
                  X_test.ix[:,1], 
                  y_test.ix[:,0],
                  color=(0,1,0), scale_factor=scale_factor)
    mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
    mlab.axes()
    mlab.show()
    
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)

In [36]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.00301813848558
testing error
0.00529973748665
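The feature_selection module imported at the top has gone unused; as a sketch, its f_regression function scores each predictor's univariate relationship with the response, which can suggest which columns are worth including:

# Univariate F-test of each predictor against Sales.
F, pvals = feature_selection.f_regression(Xy[['TV','Radio','Newspaper']], Xy['Sales'])
print F
print pvals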

Back to the notes.

